Businesses like banks which provide service have to worry about problem of 'Customer Churn' i.e. customers leaving and joining another service provider. It is important to understand which aspects of the service influence a customer's decision in this regard. Management can concentrate efforts on improvement of service, keeping in mind these priorities.
You as a Data scientist with the bank need to build a neural network based classifier that can determine whether a customer will leave the bank or not in the next 6 months.
CustomerId: Unique ID which is assigned to each customer
Surname: Last name of the customer
CreditScore: It defines the credit history of the customer.
Geography: A customer’s location
Gender: It defines the Gender of the customer
Age: Age of the customer
Tenure: Number of years for which the customer has been with the bank
NumOfProducts: refers to the number of products that a customer has purchased through the bank.
Balance: Account balance
HasCrCard: It is a categorical variable which decides whether the customer has credit card or not.
EstimatedSalary: Estimated salary
IsActiveMember: It is a categorical variable which indicates whether the customer is an active member of the bank or not ( Active member in the sense, using bank products regularly, making transactions etc )
Exited : whether or not the customer left the bank within six months. It can take two values 0=No ( Customer did not leave the bank ) 1=Yes ( Customer left the bank )
Mounting the Drive
from google.colab import drive
drive.mount('/content/drive')
Loading the required libraries
!pip install keras-tuner
# Libraries to help with reading and manipulating data
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
import keras_tuner
import pandas as pd
import numpy as np
# Library to split data
from sklearn.model_selection import train_test_split
# Library to encode the variables
from sklearn import preprocessing
# To plot confusion matrix
from sklearn.metrics import confusion_matrix
# libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# library to import to standardize the data
from sklearn.preprocessing import StandardScaler
#To import different metrics
from sklearn import metrics
from tensorflow.keras import backend
# Library to avoid the warnings
import warnings
warnings.filterwarnings("ignore")
# importing different functions to build models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
#from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
# importing GridSearch CV
from sklearn.model_selection import GridSearchCV
# importing roc_curve to plot
from sklearn.metrics import roc_curve
from matplotlib import pyplot
# importing SMOTE
from imblearn.over_sampling import SMOTE
# importing metrics
from sklearn import metrics
import random
#Importing the callback API (used for early stopping)
from keras import callbacks
data = pd.read_csv("/content/drive/MyDrive/01 Texas Program/05 Introduction to Neural Networks/Project/Churn.csv")
# Checking the number of rows and columns in the data
data.shape
# let's create a copy of the data
df = data.copy()
# let's view the first 5 rows of the data
df.head()
# let's view the last 5 rows of the data
df.tail()
# let's check the data types of the columns in the dataset
df.info()
Lets check the missing values
df.isna().sum()
Lets convert the columns with an 'object' datatype into categorical variables
# Convert every string ("object" dtype) column to pandas' categorical dtype.
object_cols = df.select_dtypes(include="object").columns
for col in object_cols:
    df[col] = pd.Categorical(df[col])
df.head(10)
# let's check the data types of the columns in the dataset after the conversion
df.info()
# let's check for duplicate values in the data
df.duplicated().sum()
#finding the null values
data.isnull().sum()
Let's check the number of unique values in each column
data.nunique()
Data Cleaning
# Dropping identifier columns (unique per customer, no predictive value)
df.drop(["CustomerId","RowNumber","Surname"], axis=1, inplace=True)
# let's view the statistical summary of the numerical columns in the data
df.describe().T
# let's view the statistical summary of the categorical columns in the data
df.describe(include=["category"]).T
# list of all categorical variables
cat_col = [
"Geography",
"Gender",
]
# printing the number of occurrences of each unique value in each categorical column
for column in cat_col:
print(df[column].value_counts(normalize=True))
print("-" * 50)
Questions:
# function to plot a boxplot and a histogram along the same scale.
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Plot a boxplot and a histogram of a numeric column on a shared x-axis.

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12, 7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None -> seaborn chooses)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # x-axis shared so the two panels line up
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans=True marks the mean with a triangle.
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )
    # Single histplot call; pass bins only when the caller supplied one.
    # (The original called histplot twice via a conditional expression and
    # passed palette= without hue=, which seaborn ignores with a warning.)
    hist_kwargs = {"data": data, "x": feature, "kde": kde, "ax": ax_hist2}
    if bins:
        hist_kwargs["bins"] = bins
    sns.histplot(**hist_kwargs)
    # Reference lines: dashed green = mean, solid black = median.
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
# function to create labeled barplots
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot of a categorical column with the count (or percentage) above each bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of counts (default False)
    n: displays the top n category levels (default None, i.e., display all levels)
    """
    total = len(data[feature])  # number of rows: denominator for percentages
    count = data[feature].nunique()
    # Scale the figure width with the number of bars actually shown.
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # plain truthiness instead of `perc == True`
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            # format the count as an integer (raw get_height() is a float)
            label = "{:.0f}".format(p.get_height())
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the count/percentage just above the bar
    plt.show()  # show the plot
# function to plot stacked bar chart
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a normalized stacked bar chart.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # Sort rows by the least frequent target class (last entry of value_counts).
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Same crosstab normalized per row, for the stacked percentages.
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # The original called plt.legend() twice; only the last call takes effect,
    # so the redundant first call was removed.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
### Function to plot distributions
def distribution_plot_wrt_target(data, predictor, target):
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
target_uniq = data[target].unique()
axs[0, 0].set_title("Distribution of target for target=" + str(target_uniq[0]))
sns.histplot(
data=data[data[target] == target_uniq[0]],
x=predictor,
kde=True,
ax=axs[0, 0],
color="teal",
)
axs[0, 1].set_title("Distribution of target for target=" + str(target_uniq[1]))
sns.histplot(
data=data[data[target] == target_uniq[1]],
x=predictor,
kde=True,
ax=axs[0, 1],
color="orange",
)
axs[1, 0].set_title("Boxplot w.r.t target")
sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")
axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
sns.boxplot(
data=data,
x=target,
y=predictor,
ax=axs[1, 1],
showfliers=False,
palette="gist_rainbow",
)
plt.tight_layout()
plt.show()
def outliers(data, feature):
    """
    Print and return the IQR-based outlier bounds for a numeric column.

    data: dataframe
    feature: dataframe column
    Returns (lower, upper): values outside this range are considered outliers.
    (Returning the bounds is new and backward compatible — existing calls
    that ignore the return value are unaffected.)
    """
    Q1 = data[feature].quantile(0.25)  # 25th percentile
    Q3 = data[feature].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Inter Quartile Range (75th percentile - 25th percentile)
    # Tukey's rule: values beyond 1.5*IQR from the quartiles are outliers.
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    print("lower ", lower)
    print("upper ", upper)
    return lower, upper
labeled_barplot(df, "Geography")
labeled_barplot(df, "Gender")
labeled_barplot(df, "HasCrCard")
labeled_barplot(df, "IsActiveMember")
labeled_barplot(df, "Exited")
histogram_boxplot(df, "CreditScore", kde=True)
outliers(df, "CreditScore")
histogram_boxplot(df, "Age", kde=True)
outliers(df, "Age")
labeled_barplot(df, "Tenure")
labeled_barplot(df, "NumOfProducts")
histogram_boxplot(df, "EstimatedSalary", kde=True)
histogram_boxplot(df, "Balance", kde=True)
plt.figure(figsize=(15, 7))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
sns.pairplot(data=df, diag_kind="kde", hue="Exited")
plt.show()
Bivariate Analysis
Exited vs CreditScore
distribution_plot_wrt_target(df, "CreditScore", "Exited")
Exited vs Geography
stacked_barplot(df, "Geography", "Exited")
Exited vs Gender
stacked_barplot(df, "Gender", "Exited")
Exited vs Age
stacked_barplot(df, "Age", "Exited")
distribution_plot_wrt_target(df, "Age", "Exited")
Exited vs Tenure
stacked_barplot(df, "Tenure", "Exited")
Exited vs NumOfProducts
stacked_barplot(df, "NumOfProducts", "Exited")
Exited vs Balance
distribution_plot_wrt_target(df, "Balance", "Exited")
Exited vs HasCrCard
stacked_barplot(df, "HasCrCard", "Exited")
Exited vs EstimatedSalary
distribution_plot_wrt_target(df, "EstimatedSalary", "Exited")
Exited vs isActiveMember
stacked_barplot(df, "IsActiveMember", "Exited")
Reply to questions
Split the data in train, validation and test data
# creating the copy of the dataframe
df1 = df.copy()
## Storing required categorical variables in cat_dat to apply dummification
cat_data = ["Gender",'Geography']
X = df1.drop(["Exited"], axis=1)
y = df1["Exited"]
print(X.shape)
print(y.shape)
print(type(X))
y.unique()
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test
# (20% of the rows are held out as the final test set)
X_temp, X_test, y_temp, y_test = train_test_split(
X, y, test_size=0.2, random_state=1, shuffle=True
)
# then we split the temporary set into train and validation
# (0.25 of the remaining 80% = 20% of the total, giving a 60/20/20 split)
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size=0.25, random_state=1, shuffle=True
)
print(X_train.shape, X_val.shape, X_test.shape)
X_train.head()
Normalizing
The numerical columns present in this dataset have different units (years, money, score), so scaling helps bring them all into the same range.
## Scaling the data
# Fit the scaler on the TRAINING split only, then apply that same fitted
# transformation to train, validation and test — this avoids data leakage.
num_cols = ["CreditScore", "Age", "Tenure", "Balance", "EstimatedSalary"]
sc = StandardScaler()
sc.fit(X_train[num_cols])
X_train[num_cols] = sc.transform(X_train[num_cols])
X_test[num_cols] = sc.transform(X_test[num_cols])
X_val[num_cols] = sc.transform(X_val[num_cols])
This is a classification problem, there is no need of scaling the target variable.
Categorical Encoding
# Using pd.get_dummies to one-hot encode the categorical columns in the data.
# drop_first=True drops one level per variable to avoid the dummy-variable trap
# (perfect multicollinearity among the indicator columns).
X_train = pd.get_dummies(X_train,columns=cat_data,drop_first= True)
X_test = pd.get_dummies(X_test,columns=cat_data,drop_first= True)
X_val = pd.get_dummies(X_val,columns=cat_data,drop_first= True)
# Checking the shape
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)
Let's define a function to output different metrics (including recall) on the train and test set and a function to show confusion matrix so that we do not have to use the same code repetitively while evaluating models.
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Plot an sklearn confusion matrix `cf` as a labeled Seaborn heatmap.

    cf: 2-D confusion matrix (e.g. from sklearn.metrics.confusion_matrix)
    group_names: list of strings, one per cell (used only if len == cf.size)
    categories: axis tick labels ('auto' or a list of class names)
    count / percent: include raw counts / overall percentages in each cell
    cbar: whether to draw the colorbar
    xyticks: whether to show axis tick labels
    xyplotlabels: kept for interface compatibility; not used here
    sum_stats: whether to show overall accuracy beneath the plot
    figsize: figure size (defaults to matplotlib's rcParams value)
    cmap: heatmap colormap
    title: optional plot title
    '''
    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for i in range(cf.size)]
    if group_names and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks
    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks
    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten() / np.sum(cf)]
    else:
        group_percentages = blanks
    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])
    # SUMMARY STATISTICS: accuracy = sum of diagonal / total observations.
    # The original computed accuracy but never displayed it (dead code);
    # it is now shown as the x-axis label.
    stats_text = ""
    if sum_stats:
        accuracy = np.trace(cf) / float(np.sum(cf))
        stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')
    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False
    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories, yticklabels=categories)
    if stats_text:
        plt.xlabel(stats_text)
    if title:
        plt.title(title)
Early stopping:
In case of overfitting, an early stopping callback will be added. During training, the model is evaluated on a holdout validation dataset after each epoch. If the performance of the model on the validation dataset starts to degrade or stops improving (e.g. loss begins to increase or accuracy begins to decrease), then training is stopped after a certain number of iterations (the patience). The model at the time training is stopped is then used and is known to have good generalization performance.
This procedure is called “early stopping” and is perhaps one of the oldest and most widely used forms of neural network regularization.
# Defining Early stopping
es_cb = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5)
# Reset Keras' global state so repeated runs start from a fresh graph
backend.clear_session()
#Fixing the seed for random number generators so that we can ensure we receive the same output everytime
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
# Initializing the ANN
model_SGD = Sequential()
# First hidden layer: 64 relu units. input_dim = 11 matches the number of
# feature columns after scaling and one-hot encoding.
# This adds the input layer (by specifying input dimension) AND the first hidden layer (units)
model_SGD.add(Dense(activation = 'relu', input_dim = 11, units=64))
#Add 2nd hidden layer (32 units)
model_SGD.add(Dense(32, activation='relu'))
# Adding the output layer
# Notice that we do not need to specify input dim.
# we have an output of 1 node, which is the desired dimension of our output (stay with the bank or not)
# We use the sigmoid because we want probability outcomes
model_SGD.add(Dense(1, activation = 'sigmoid'))
# Compile the model with the SGD optimizer (default learning rate) and
# binary cross-entropy, the standard loss for a sigmoid binary classifier
model_SGD.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['accuracy'])
# Model summary
model_SGD.summary()
# Train for 50 epochs, monitoring the held-out validation split each epoch
history_SGD=model_SGD.fit(X_train, y_train,
validation_data=(X_val,y_val),
epochs=50,
batch_size=32,verbose=1)
Loss function
# Capturing learning history per epoch
hist = pd.DataFrame(history_SGD.history)
hist['epoch'] = history_SGD.epoch
# Plotting accuracy at different epochs
plt.plot(hist['loss'])
plt.plot(hist['val_loss'])
plt.legend(("train" , "valid") , loc =0)
#Printing results
results = model_SGD.evaluate(X_test, y_test)
The validation data is performing well with respect to the training data. However, the model tends to overfit.
Let's check the other metrics.
from sklearn.metrics import roc_curve
from matplotlib import pyplot
# predict probabilities
yhat_SGD = model_SGD.predict(X_test)
# keep probabilities for the positive outcome only
yhat_SGD = yhat_SGD[:, 0]
# calculate roc curves
fpr, tpr, thresholds_SGD = roc_curve(y_test, yhat_SGD)
# calculate the g-mean for each threshold
gmeans_SGD = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans_SGD)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds_SGD[ix], gmeans_SGD[ix]))
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()
Confusion matrix
# Predicting class labels using the best G-mean threshold as the cutoff
y_pred_SGD = model_SGD.predict(X_test)
y_pred_SGD = (y_pred_SGD > thresholds_SGD[ix])
y_pred_SGD
#Calculating the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
cm_SGD = confusion_matrix(y_test, y_pred_SGD)
# sklearn's confusion_matrix puts the actual "Not Exited" (0) class in row 0,
# so the flattened cell order is TN, FP, FN, TP — the original labels listed
# them in TP, FN, FP, TN order, mislabeling every cell.
labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
categories = ['Not Exited', 'Exited']
make_confusion_matrix(cm_SGD,
                      group_names=labels,
                      categories=categories,
                      cmap='Blues')
#Accuracy as per the classification report
from sklearn import metrics
cr_SGD = metrics.classification_report(y_test, y_pred_SGD)
print(cr_SGD)
The above model has:
Let's see if we can improve this value.
backend.clear_session()
#Fixing the seed for random number generators so that we can ensure we receive the same output everytime
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
# Initializing the ANN
model_ADAM = Sequential()
# First hidden layer: 64 relu units; input_dim = 11 matches the number of feature columns after encoding.
# This adds the input layer (by specifying input dimension) AND the first hidden layer (units)
model_ADAM.add(Dense(activation = 'relu', input_dim = 11, units=64))
#Add 1st hidden layer
model_ADAM.add(Dense(32, activation='relu'))
# Adding the output layer
# Notice that we do not need to specify input dim.
# we have an output of 1 node, which is the the desired dimensions of our output (stay with the bank or not)
# We use the sigmoid because we want probability outcomes
model_ADAM.add(Dense(1, activation = 'sigmoid'))
#Compiling the ANN with Adam optimizer and binary cross entropy loss function
optimizer = tf.keras.optimizers.Adam(0.001)
model_ADAM.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
model_ADAM.summary()
history_ADAM=model_ADAM.fit(X_train, y_train,
validation_data=(X_val,y_val),
epochs=50,
batch_size=32,verbose=1)
# Capturing learning history per epoch
hist = pd.DataFrame(history_ADAM.history)
hist['epoch'] = history_ADAM.epoch
# Plotting accuracy at different epochs
plt.plot(hist['loss'])
plt.plot(hist['val_loss'])
plt.legend(("train" , "valid") , loc =0)
#Printing results
results = model_ADAM.evaluate(X_test, y_test)
Here the model seems to be overfitted, as the gap between the training curve and the validation curve keeps increasing. The early stopping callback will be used.
# Initializing the ANN
model_ADAM_es = Sequential()
model_ADAM_es.add(Dense(activation = 'relu', input_dim = 11, units=64))
#Add 1st hidden layer
model_ADAM_es.add(Dense(32, activation='relu'))
# We use the sigmoid because we want probability outcomes
model_ADAM_es.add(Dense(1, activation = 'sigmoid'))
#Compiling the ANN with Adam optimizer and binary cross entropy loss function
optimizer = tf.keras.optimizers.Adam(0.001)
model_ADAM_es.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
history_ADAM_es=model_ADAM_es.fit(X_train, y_train,
validation_data=(X_val,y_val),
epochs=50,
batch_size=32,verbose=1, callbacks=[es_cb])
# Capturing learning history per epoch
hist = pd.DataFrame(history_ADAM_es.history)
hist['epoch'] = history_ADAM_es.epoch
# Plotting accuracy at different epochs
plt.plot(hist['loss'])
plt.plot(hist['val_loss'])
plt.legend(("train" , "valid") , loc =0)
#Printing results
results = model_ADAM_es.evaluate(X_test, y_test)
Even with early stopping to prevent overfitting, the training and validation curves still diverge, indicating some remaining overfitting.
from sklearn.metrics import roc_curve
from matplotlib import pyplot
# predict probabilities
yhat_ADAM = model_ADAM_es.predict(X_test)
# keep probabilities for the positive outcome only
yhat_ADAM = yhat_ADAM[:, 0]
# calculate roc curves
fpr, tpr, thresholds_ADAM = roc_curve(y_test, yhat_ADAM)
# calculate the g-mean for each threshold
gmeans_ADAM = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans_ADAM)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds_ADAM[ix], gmeans_ADAM[ix]))
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()
#Predicting the results using best G-mean threshold as the cutoff
y_pred_ADAM = model_ADAM_es.predict(X_test)
y_pred_ADAM = (y_pred_ADAM > thresholds_ADAM[ix])
y_pred_ADAM
#Calculating the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
cm_ADAM = confusion_matrix(y_test, y_pred_ADAM)
# sklearn's confusion_matrix puts the actual "Not Exited" (0) class in row 0,
# so the flattened cell order is TN, FP, FN, TP (the original labels were reversed).
labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
categories = ['Not Exited', 'Exited']
make_confusion_matrix(cm_ADAM,
                      group_names=labels,
                      categories=categories,
                      cmap='Blues')
The false positives got a reduction but the percentage of true positives is less than the percentage obtained for SGD model training.
#Accuracy as per the classification report
from sklearn import metrics
cr_ADAM=metrics.classification_report(y_test,y_pred_ADAM)
print(cr_ADAM)
The Adam optimizer model has:
Let's see if we can improve this value.
backend.clear_session()
#Fixing the seed for random number generators so that we can ensure we receive the same output everytime
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
# Initializing the ANN
model_DOUT = Sequential()
# First hidden layer: 64 relu units; input_dim = 11 matches the number of feature columns after encoding.
# This adds the input layer (by specifying input dimension) AND the first hidden layer (units)
model_DOUT.add(Dense(activation = 'relu', input_dim = 11, units=64))
model_DOUT.add(Dropout(0.2))
#Add 1st hidden layer
model_DOUT.add(Dense(32, activation='relu'))
model_DOUT.add(Dropout(0.2))
# Adding the output layer
# Notice that we do not need to specify input dim.
# we have an output of 1 node, which is the the desired dimensions of our output (stay with the bank or not)
# We use the sigmoid because we want probability outcomes
model_DOUT.add(Dense(1, activation = 'sigmoid'))
#Compiling the ANN with Adam optimizer and binary cross entropy loss function
optimizer = tf.keras.optimizers.Adam(0.001)
model_DOUT.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
model_DOUT.summary()
history_DOUT=model_DOUT.fit(X_train, y_train,
validation_data=(X_val,y_val),
epochs=50,
batch_size=32,verbose=1)
# Capturing learning history per epoch
hist = pd.DataFrame(history_DOUT.history)
hist['epoch'] = history_DOUT.epoch
# Plotting accuracy at different epochs
plt.plot(hist['loss'])
plt.plot(hist['val_loss'])
plt.legend(("train" , "valid") , loc =0)
#Printing results
results = model_DOUT.evaluate(X_test, y_test)
Here the model seems to be overfitted, as the gap between the training curve and the validation curve keeps increasing. The early stopping callback will be used.
backend.clear_session()
#Fixing the seed for random number generators so that we can ensure we receive the same output everytime
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
# Initializing the ANN and adding dropout ratio of 0.2
model_DOUT_es = Sequential()
model_DOUT_es.add(Dense(activation = 'relu', input_dim = 11, units=64))
model_DOUT_es.add(Dropout(0.2))
#Add 1st hidden layer
model_DOUT_es.add(Dense(32, activation='relu'))
model_DOUT_es.add(Dropout(0.2))
# Adding the output layer
model_DOUT_es.add(Dense(1, activation = 'sigmoid'))
#Compiling the ANN with Adam optimizer and binary cross entropy loss function
optimizer = tf.keras.optimizers.Adam(0.001)
model_DOUT_es.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
history_DOUT_es=model_DOUT_es.fit(X_train, y_train,
validation_data=(X_val,y_val),
epochs=50,
batch_size=32,verbose=1, callbacks=[es_cb])
# Capturing learning history per epoch
hist = pd.DataFrame(history_DOUT_es.history)
hist['epoch'] = history_DOUT_es.epoch
# Plotting accuracy at different epochs
plt.plot(hist['loss'])
plt.plot(hist['val_loss'])
plt.legend(("train" , "valid") , loc =0)
#Printing results
results = model_DOUT_es.evaluate(X_test, y_test)
# predict probabilities
yhat_DOUT = model_DOUT_es.predict(X_test)
# keep probabilities for the positive outcome only
yhat_DOUT = yhat_DOUT[:, 0]
# calculate roc curves
fpr, tpr, thresholds_DOUT = roc_curve(y_test, yhat_DOUT)
# calculate the g-mean for each threshold
gmeans_DOUT = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans_DOUT)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds_DOUT[ix], gmeans_DOUT[ix]))
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()
#Predicting the results using best G-mean threshold as the cutoff
y_pred_DOUT = model_DOUT_es.predict(X_test)
y_pred_DOUT = (y_pred_DOUT > thresholds_DOUT[ix])
y_pred_DOUT
#Calculating the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
cm_DOUT = confusion_matrix(y_test, y_pred_DOUT)
# sklearn's confusion_matrix puts the actual "Not Exited" (0) class in row 0,
# so the flattened cell order is TN, FP, FN, TP (the original labels were reversed).
labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
categories = ['Not Exited', 'Exited']
make_confusion_matrix(cm_DOUT,
                      group_names=labels,
                      categories=categories,
                      cmap='Blues')
#Accuracy as per the classification report
from sklearn import metrics
cr_DOUT=metrics.classification_report(y_test,y_pred_DOUT)
print(cr_DOUT)
The Neural Network model with dropout has:
This is the best G-mean so far.
Let's see if we can improve this value.
Some important hyperparameters to look out for while optimizing neural networks are:
Type of Architecture
Number of Layers
Number of Neurons in a layer
Regularization hyperparameters
Learning Rate
Type of Optimizer
Dropout Rate
We are using Random search to optimize the following hyperparameters:
backend.clear_session()
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
def build_model(h):
    """
    Build a tunable Keras model for keras-tuner's RandomSearch.

    h: keras_tuner HyperParameters object.
    Tuned hyperparameters: number of hidden layers (1-5), units per layer
    (32-128 in steps of 32), optional dropout (rate 0.25), learning rate.
    Returns a compiled keras.Sequential model.
    """
    model = keras.Sequential()
    # 1 to 5 hidden layers, each with 32-128 units
    for i in range(h.Int('num_layers', 1, 5)):
        model.add(layers.Dense(units=h.Int('units_' + str(i),
                                           min_value=32,
                                           max_value=128,
                                           step=32),
                               activation='relu'))
    # BUG FIX: the optional Dropout layer must come BEFORE the output layer.
    # The original added it after the sigmoid output, where it would randomly
    # zero the prediction itself during training.
    if h.Boolean("dropout"):
        model.add(layers.Dropout(rate=0.25))
    # Single sigmoid unit -> churn probability
    model.add(layers.Dense(1, activation='sigmoid'))
    # Tune the learning rate of the Adam optimizer
    model.compile(
        optimizer=keras.optimizers.Adam(
            h.Choice('learning_rate', [1e-1, 1e-2, 1e-3])),
        loss='binary_crossentropy',
        metrics=['accuracy'])
    return model
# Randomly sample hyperparameter combinations, keeping the best val_accuracy
tuner = RandomSearch(
build_model,
objective="val_accuracy",
# Let's use 25 trials (one training run per trial)
max_trials=25,
executions_per_trial=1,
project_name='Job_',
)
tuner.search_space_summary()
### Searching the best model on X and y train
tuner.search(X_train, y_train,
epochs=15,
validation_split = 0.2)
## Printing the best models with their hyperparameters
tuner.results_summary()
Let's create a model with the best configuration mentioned above:
backend.clear_session()
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
model_RS = Sequential()
model_RS.add(Dense(128,activation='relu',input_dim = 11))
model_RS.add(Dense(32,activation='relu'))
model_RS.add(Dense(32,activation='relu'))
model_RS.add(Dense(32,activation='relu'))
model_RS.add(Dense(32,activation='relu'))
model_RS.add(Dense(1, activation = 'sigmoid'))
model_RS.summary()
optimizer = tf.keras.optimizers.Adam(0.01)
model_RS.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
history_RS = model_RS.fit(X_train,y_train,batch_size=64,epochs=50,verbose=1,validation_data=(X_val,y_val), callbacks=[es_cb])
#Plotting Train Loss vs Validation Loss
plt.plot(history_RS.history['loss'])
plt.plot(history_RS.history['val_loss'])
plt.title('model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
There is not much improvement with respect to the previous models, and we start to see noise in the training curves.
# predict probabilities
yhat_RS = model_RS.predict(X_test)
# keep probabilities for the positive outcome only
yhat_RS = yhat_RS[:, 0]
# calculate roc curves
fpr, tpr, thresholds_RS = roc_curve(y_test, yhat_RS)
# calculate the g-mean for each threshold
gmeans_RS = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans_RS)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds_RS[ix], gmeans_RS[ix]))
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()
# Predicting class labels using the best G-mean threshold as the cutoff
y_pred_RS = model_RS.predict(X_test)
y_pred_RS = (y_pred_RS > thresholds_RS[ix])
y_pred_RS
#Calculating the confusion matrix
from sklearn.metrics import confusion_matrix
cm_RS = confusion_matrix(y_test, y_pred_RS)
# sklearn's confusion_matrix puts the actual "Not Exited" (0) class in row 0,
# so the flattened cell order is TN, FP, FN, TP (the original labels were reversed).
labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
# Fixed copy-pasted category names from another project ('Changing Job');
# this notebook's target is Exited.
categories = ['Not Exited', 'Exited']
make_confusion_matrix(cm_RS,
                      group_names=labels,
                      categories=categories,
                      cmap='Blues')
#Accuracy as per the classification report
from sklearn import metrics
cr_RS = metrics.classification_report(y_test, y_pred_RS)
print(cr_RS)
The Random Search with Keras tuning model has:
##Applying SMOTE on train and test
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy='not majority')
X_sm , y_sm = smote.fit_resample(X_train,y_train)
print("Before Oversampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Oversampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Oversampling, counts of label 'Yes': {}".format(sum(y_sm == 1)))
print("After Oversampling, counts of label 'No': {} \n".format(sum(y_sm == 0)))
print("After Oversampling, the shape of train_X: {}".format(X_sm.shape))
print("After Oversampling, the shape of train_y: {} \n".format(y_sm.shape))
We are using Random search to optimize the following hyperparameters:
Build a model with the balanced dataset
backend.clear_session()
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
def build_model_2(h):
    """
    Tunable Keras model for the SMOTE-balanced data.

    h: keras_tuner HyperParameters object.
    Tunes the number of relu hidden layers (1-5), their widths (32-128,
    step 32) and the Adam learning rate; sigmoid output for binary churn.
    """
    net = keras.Sequential()
    n_hidden = h.Int('num_layers', 1, 5)
    for layer_idx in range(n_hidden):
        width = h.Int('units_' + str(layer_idx),
                      min_value=32, max_value=128, step=32)
        net.add(layers.Dense(units=width, activation='relu'))
    net.add(layers.Dense(1, activation='sigmoid'))
    lr = h.Choice('learning_rate', [1e-1, 1e-2, 1e-3])
    net.compile(
        optimizer=keras.optimizers.Adam(lr),
        loss='binary_crossentropy',
        metrics=['accuracy'])
    return net
tuner_2 = RandomSearch(
build_model_2,
objective='val_accuracy',
max_trials=15,
executions_per_trial=1,
project_name='Job_Switch')
tuner_2.search_space_summary()
tuner_2.search(X_sm, y_sm,
epochs=15,
validation_data=(X_val,y_val))
tuner_2.results_summary()
Let's try a model with the best hyperparameters obtained
backend.clear_session()
np.random.seed(42)
import random
random.seed(42)
tf.random.set_seed(42)
model_sm = Sequential()
model_sm.add(Dense(64,activation='relu',input_dim = 11))
model_sm.add(Dense(128,activation='relu'))
model_sm.add(Dense(1, activation = 'sigmoid'))
model_sm.summary()
optimizer = tf.keras.optimizers.Adam(0.1)
model_sm.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
history_sm = model_sm.fit(X_sm,y_sm,batch_size=64,epochs=50,verbose=1,validation_split = 0.2, callbacks=[es_cb])
#Plotting Train Loss vs Validation Loss
plt.plot(history_sm.history['loss'])
plt.plot(history_sm.history['val_loss'])
plt.title('model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
The balance strategy produced a lot of noise in the model
# predict probabilities
yhat_sm = model_sm.predict(X_test)
# keep probabilities for the positive outcome only
yhat_sm = yhat_sm[:, 0]
# calculate roc curves
fpr, tpr, thresholds_sm = roc_curve(y_test, yhat_sm)
# calculate the g-mean for each threshold
gmeans_sm = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans_sm)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds_sm[ix], gmeans_sm[ix]))
# plot the roc curve for the model
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
# show the plot
pyplot.show()
# Predicting class labels using the best G-mean threshold as the cutoff
y_pred_sm = model_sm.predict(X_test)
y_pred_sm = (y_pred_sm > thresholds_sm[ix])
y_pred_sm
#Calculating the confusion matrix
from sklearn.metrics import confusion_matrix
cm_sm = confusion_matrix(y_test, y_pred_sm)
# sklearn's confusion_matrix puts the actual "Not Exited" (0) class in row 0,
# so the flattened cell order is TN, FP, FN, TP (the original labels were reversed).
labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
# Fixed copy-pasted category names from another project ('Changing Job');
# this notebook's target is Exited.
categories = ['Not Exited', 'Exited']
make_confusion_matrix(cm_sm,
                      group_names=labels,
                      categories=categories,
                      cmap='Blues')
Oversampling using smote did not help improve the recall score.
The balanced model has:
What recommedations would you suggest to the bank?